Return to Data Page

import numpy as np
from numpy import random
import pandas as pd

from scipy import stats
from scipy.stats import norm #normal
from scipy.stats import genextreme as gev #generalized extreme value
from scipy.stats import pareto #pareto

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

\(\color{darkblue}{\textbf{Probability}}\)


\(\color{dodgerblue}{\textbf{Binary}}\)

# Seed the global NumPy RNG so the draws below are reproducible
np.random.seed(613)

# 100 independent Bernoulli trials, each succeeding with probability 0.7
pass_fail = stats.bernoulli.rvs(size = 100, p = 0.7)

pass_fail
array([0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1])


\(\color{skyblue}{\textrm{- Probabilities}}\)

Probability of Success = p

round(sum(pass_fail) / len(pass_fail), 2)
0.62

Probability of Failure = q = 1 - p

round(1 - (sum(pass_fail) / len(pass_fail)), 2)
0.38


\(\color{skyblue}{\textrm{- Distribution}}\)

# Cast the 0/1 ints to strings so histplot treats them as two categories
# NOTE(review): this rebinds pass_fail — later numeric use would need a re-cast
pass_fail = np.array(pass_fail, dtype=str)

sns.histplot(pass_fail,
             color = 'darkblue')
plt.show()


\(\color{dodgerblue}{\textbf{Discrete}}\)

grade = np.random.choice(np.array(['A','B','C','F']), 100, p=[0.15,0.55,0.2,0.1])

grade
array(['B', 'C', 'B', 'C', 'B', 'A', 'A', 'B', 'F', 'C', 'A', 'B', 'B',
       'B', 'B', 'C', 'B', 'B', 'A', 'B', 'A', 'A', 'B', 'B', 'B', 'B',
       'A', 'B', 'A', 'A', 'B', 'F', 'C', 'B', 'B', 'B', 'F', 'B', 'C',
       'A', 'B', 'B', 'C', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'B', 'B',
       'A', 'A', 'A', 'C', 'B', 'F', 'B', 'B', 'B', 'B', 'B', 'C', 'B',
       'A', 'A', 'F', 'A', 'B', 'B', 'C', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'F', 'F', 'A', 'A', 'C', 'B', 'B', 'A',
       'B', 'B', 'B', 'B', 'B', 'A', 'B', 'B', 'B'], dtype='<U1')


\(\color{skyblue}{\textrm{- Probabilities}}\)

# Empirical probability of each grade: its relative frequency in the sample
P = {letter: sum(grade == letter) / len(grade) for letter in ("A", "B", "C", "F")}

P
{'A': 0.2, 'B': 0.61, 'C': 0.12, 'F': 0.07}


\(\color{skyblue}{\textrm{- Distribution}}\)

# Sort in place so the histogram categories appear in A/B/C/F order
grade.sort()

sns.histplot(grade,
             color = 'darkblue')
plt.show()


\(\color{dodgerblue}{\textbf{Continuous}}\)

score = np.random.normal(loc=60, scale=15, size=100)

score
array([49.42178966, 60.11318944, 57.52655737, 55.18500198, 83.58669807,
       60.60611116, 62.84955862, 91.88147056, 58.18561255, 44.43607505,
       56.52203474, 50.3039921 , 73.73464322, 70.39393145, 71.64669984,
       68.42587924, 28.40244391, 38.53671815, 66.48988439, 63.57931322,
       52.35943507, 51.69438108, 72.61734787, 68.00083258, 32.2147696 ,
       62.812808  , 90.66467772, 40.40392912, 80.68296546, 54.57851271,
       52.43785367, 49.71713692, 44.48677425, 34.22687865, 79.11074983,
       33.04023258, 55.9149938 , 63.05036148, 78.04091144, 46.51584864,
       59.93718497, 73.32257878, 52.59278977, 76.8084071 , 74.4950962 ,
       67.11017229, 55.39331216, 56.46019432, 56.90356894, 43.26492979,
       73.47407378, 55.42948985, 60.05844776, 36.53838227, 55.98379844,
       63.00929694, 68.01131318, 70.25352878, 71.27606349, 54.99293147,
       75.80943182, 47.86434823, 44.67846184, 45.20892105, 54.01298278,
       37.86224058, 70.24213513, 64.20159803, 88.30329909, 38.15488611,
       35.13638258, 54.26145695, 48.48772986, 74.51270407, 53.70828047,
       78.23249199, 69.77339593, 62.31208649, 44.30688124, 79.06567921,
       67.89160877, 51.87449517, 17.56268311, 49.42229534, 56.74312746,
       63.24380317, 45.36827356, 59.07366461, 48.74952759, 74.39034928,
       89.44170169, 67.96653877, 84.11970041, 61.24483127, 62.50491433,
       56.72530542, 26.22567409, 63.09046925, 67.84436935, 70.26057196])


\(\color{skyblue}{\textrm{- Probabilities}}\)

The probability of a specific outcome is extremely low because there are an infinite number of possible outcomes:

sum(score == 80.0) / len(score)
0.0

Instead, look at higher, lower, or a range:

sum((score > 75) & (score < 85)) / len(score)
0.09


\(\color{skyblue}{\textrm{- Distribution}}\)

# Histogram with a kernel-density overlay of the 100 simulated scores
sns.histplot(score, 
             kde = True, 
             bins = 20, 
             color = 'darkblue')
plt.show()

\(\color{darkblue}{\textbf{Probability Distributions}}\)


\(\color{dodgerblue}{\textbf{Types}}\)

\(\color{skyblue}{\textrm{Binomial}}\)

A single binary event is a Bernoulli distribution; a collection of Bernoulli events is a binomial distribution.

  • The experiment consists of a fixed number of trials (n)
  • Each trial has only two possible outcomes: success and failure
  • The probability of success (p) and the probability of failure (q) are constant throughout the experiment
  • Each trial is independent of every other trial

If true, then for n trials and r successes:

  • \(P[r,n]=\frac{n!}{(n-r)!r!}*p^r*q^{n-r}\)
  • \(\mu=n*p\)
  • \(\sigma=\sqrt{n*p*q}\)
#Eg. flipping a coin 10 times, repeated 10000 times
#Each experiment counts the successes among 10 fair Bernoulli trials
trials = [sum(stats.bernoulli.rvs(p = 0.5, size = 10)) for _ in range(10000)]

print(trials[0:100])
[4, 4, 7, 3, 3, 5, 8, 5, 5, 4, 4, 6, 6, 5, 4, 1, 4, 5, 4, 6, 5, 6, 4, 5, 3, 5, 6, 6, 7, 2, 5, 4, 7, 3, 4, 5, 6, 5, 6, 4, 4, 7, 5, 7, 3, 5, 6, 5, 4, 2, 3, 3, 3, 5, 4, 4, 6, 4, 3, 6, 6, 6, 5, 4, 7, 3, 5, 4, 5, 8, 4, 6, 5, 2, 4, 5, 5, 4, 2, 6, 7, 5, 4, 2, 7, 7, 6, 7, 4, 6, 2, 5, 5, 6, 6, 5, 5, 6, 6, 9]
#Expected Mean = n * p = 10 * 0.5 = 5; the simulated mean should be close
np.mean(trials)
4.9915
#Expected Standard Deviation = sqrt(n * p * q) = sqrt(10 * 0.5 * (1 - 0.5)) ~= 1.58
np.std(trials)
1.5848115818607587
# Histogram of simulated binomial counts; mean in red, mean +/- 1 sd in green
sns.histplot(trials,
             color = 'darkblue',
             bins = 10)

plt.axvline(np.mean(trials), color = 'r', ls = '--', lw = 2.0)
plt.axvline(np.mean(trials) - np.std(trials), color = 'g', ls = '--', lw = 2.0)
plt.axvline(np.mean(trials) + np.std(trials), color = 'g', ls = '--', lw = 2.0)

plt.show()


\(\color{skyblue}{\textrm{Poisson}}\)

  • The experiment consists of counting the number of occurrences over a period or space
  • The mean has to be the same for each interval of measurement (e.g. each hour or mile)
  • The number of occurrences during one interval is independent of all other intervals

If true, then x given λ events in the past:

  • \(P[x]=\frac{\lambda^x*e^{-\lambda}}{x!}\)
  • \(\mu=\lambda\)
  • \(\sigma=\sqrt{\lambda}\)
#Eg. 25 hurricanes in the past 22 years

# Rate parameter lambda: average occurrences per interval (per year here)
lam = 25.0 / 22

# PMF of observing 0..10 events in one interval
distribution = stats.poisson.pmf(range(0, 11), mu = lam)
sns.barplot(x = [0,1,2,3,4,5,6,7,8,9,10], 
            y = distribution,
            color = 'darkblue')

# Mean (red) and mean +/- one standard deviation sqrt(lambda) (green)
plt.axvline(lam, color = 'r', ls = '--', lw = 2.0)
plt.axvline(lam - np.sqrt(lam), color = 'g', ls = '--', lw = 2.0)
plt.axvline(lam + np.sqrt(lam), color = 'g', ls = '--', lw = 2.0)

plt.show()

#Probability of exactly 1 event
stats.poisson.pmf(1, mu = lam)
0.364754678578128
#Probability of less than 3 (i.e. of 0, 1 or 2 events)
sum(stats.poisson.pmf(range(0, 3), mu = lam))
0.892985772191726
#Probability of more than 1: 1 - P(0) - P(1)
#NOTE(review): the original label said "more than 2"; that would instead be
#1 - sum(stats.poisson.pmf(range(0, 3), mu = lam)) ~= 0.107 — comment and code disagreed
1 - sum(stats.poisson.pmf(range(0, 2), mu = lam))
0.31426120427311943


\(\color{skyblue}{\textrm{Continuous}}\)

  • Continuous variables
# Grid of 1001 evenly spaced points from -5.00 to 5.00 in steps of 0.01
x = np.arange(-500, 501) * 0.01

plt.plot(x, stats.norm.pdf(x, 0, 1))
plt.title('PDF (probability density function) of a standard normal distribution')
plt.show()

plt.plot(x,stats.norm.cdf(x, 0, 1))
plt.title('CDF (cumulative distribution function) of a standard normal distribution')
plt.show()


Impact of σ

# Fix the mean at 10 and vary sigma from 1 to 5 to show how spread changes the curves
x = np.array(range(0, 2000)) * 0.01
mu = 10

fig, ax = plt.subplots(figsize=(7.5, 5))
for sigma in range(1,6):
    ax.plot(x, stats.norm.pdf(x, mu, sigma),label='sigma={0}'.format(sigma))
ax.axvline(mu, color = 'r', ls = '--', lw = 2.0)  # the mean is identical for every curve
ax.legend()
ax.set_title('PDF of a normal distribution')

# Same comparison for the CDF: larger sigma gives a shallower S-curve
fig, ax = plt.subplots(figsize=(7.5, 5))
for sigma in range(1,6):
    ax.plot(x,stats.norm.cdf(x, mu, sigma), label = 'sigma={0}'.format(sigma))
ax.legend()
ax.set_title('CDF of a normal distribution')


Percentiles

\(Z=\frac{x-\mu}{\sigma}\)


Example

# Larger sample (n=1000) from the same N(60, 15) population
sample = np.random.normal(loc=60, scale=15, size=1000)
# Sample mean — should be near the population mean of 60
sample.mean()
59.63032031988109
# Sample standard deviation — should be near the population sigma of 15
sample.std()
15.717177638571053
# Density histogram with KDE overlay; mean (red) and mean +/- 1 sd (green) lines
sns.histplot(sample, 
             bins = 20, 
             color = 'blue',
             stat='density')

sns.kdeplot(sample, 
            color = 'darkblue',
            linewidth = 4)

plt.axvline(sample.mean(), color = 'r', ls = '--', lw = 2.0)
plt.axvline(sample.mean() - sample.std(), color = 'g', ls = '--', lw = 2.0)
plt.axvline(sample.mean() + sample.std(), color = 'g', ls = '--', lw = 2.0)

plt.show()

# Above Mean, Right: Probability > 80 (survival probability, 1 - CDF at 80)
1 - norm.cdf(80, loc = sample.mean(), scale = sample.std())
0.09748535909394129
# Above Mean, Left: Probability < 80 (CDF evaluated at 80)
norm.cdf(80, loc = sample.mean(), scale = sample.std())
0.9025146409060587
# Below Mean, Left: Probability < 40 (CDF evaluated at 40)
norm.cdf(40, loc = sample.mean(), scale = sample.std())
0.10583759323323055
# Below Mean, Right: Probability > 40 (complement of the previous value)
1 - norm.cdf(40, loc = sample.mean(), scale = sample.std())
0.8941624067667695
# Middle: Probability 40 < x < 80 (difference of the two CDF values)
norm.cdf(80, loc = sample.mean(), scale = sample.std()) - norm.cdf(40, loc = sample.mean(), scale = sample.std())
0.7966770476728282

\(\color{dodgerblue}{\textbf{Fitting}}\)

def distribution_analysis(x, log_scale = False, fit_distribution = 'None', bins = 50, vis_means = True):
    """Summarize and plot the distribution of a 1-D sample.

    Prints the mean and standard deviation, draws a density histogram with a
    KDE overlay, optionally marks mean/std vertical lines, and optionally
    overlays a fitted theoretical distribution (normal, GEV or Pareto).
    Returns None; output is the printed stats and the matplotlib figure.
    """
    #x - array of observations (must be strictly positive if log_scale is True)
    #log_scale - analyze distribution of log(x) if True
    #fit_distribution - fit the distribution ('normal', 'gev' or 'pareto') or do nothing if 'None'
    #bins - how many bins to use for binning the data
    #vis_means - show mean and std lines if True
    
    if log_scale: 
        x1 = np.log10(x) #convert data to decimal logarithms
        xlabel = 'log(values)' #reflect in x labels
    else:
        x1 = x #leave original scale 
        xlabel = 'values'    
        
    mu = x1.mean() #compute the mean
    if log_scale: #if logscale, output log mean, its original scale, and original scale mean
        print('Log mean = {:.2f}({:.2f}), mean = {:.2f}'.format(mu, 10**mu, x.mean()))
    else:
        print('Mean = {:.2f}'.format(mu)) #otherwise print mean
    
    sigma = x1.std() #compute and output standard deviation 
    print('Standard deviation = {:.2f}'.format(sigma))
        
    #visualize histogram and the interpolated line using seaborn
    sns.histplot(x1, bins = bins, color = 'darkblue', stat = 'density')
    
    sns.kdeplot(x1, color = 'darkblue', linewidth = 4)
    
    #show vertical lines for mean and std if vis_means = True
    if vis_means:
        plt.axvline(mu, color = 'r', ls = '--', lw=2.0)
        plt.axvline(mu-sigma, color = 'g', ls = '--', lw = 2.0)
        plt.axvline(mu+sigma, color = 'g', ls = '--', lw = 2.0)
        
    ylim = plt.gca().get_ylim() #keep the y-range of original distribution density values 
    #(to make sure the fitted distribution would not affect it)
    
    h = np.arange(mu - 3 * sigma, mu + 3 * sigma, sigma / 100) #3-sigma visualization range for the fitted distribution
    #NOTE(review): for heavy-tailed fits (e.g. pareto) this mu +/- 3*sigma window
    #may clip the overlay; also assumes sigma > 0 — confirm for constant samples
    pars = None #fitted distribution parameters
    
    #fit and visualize the theoretic distribution (red curve)
    if fit_distribution == 'normal':
        pars = norm.fit(x1)
        plt.plot(h,norm.pdf(h,*pars),'r')
    elif fit_distribution == 'gev':
        pars = gev.fit(x1)
        plt.plot(h,gev.pdf(h,*pars),'r')
    elif fit_distribution == 'pareto':
        pars = pareto.fit(x1)
        plt.plot(h,pareto.pdf(h,*pars),'r')
    
    plt.xlabel(xlabel) #add x label 
    plt.ylim(ylim) #restore the y-range of original distribution density values 
    plt.show()
distribution_analysis(sample)
Mean = 59.63
Standard deviation = 15.72

distribution_analysis(sample, fit_distribution = "normal")
Mean = 59.63
Standard deviation = 15.72

distribution_analysis(sample, fit_distribution = "pareto")
Mean = 59.63
Standard deviation = 15.72

\(\color{darkblue}{\textbf{Inferential Statistics}}\)

# gapminder: third-party dataset package (country/year panel with lifeExp etc.)
from  gapminder import gapminder

# 2007 life-expectancy values for two continents, sorted ascending
# (ascending order matters later when plotting fitted CDFs against the values)
asia_07 = gapminder[(gapminder["continent"] == "Asia") & (gapminder["year"] == 2007)]["lifeExp"].sort_values()
africa_07 = gapminder[(gapminder["continent"] == "Africa") & (gapminder["year"] == 2007)]["lifeExp"].sort_values()

\(\color{dodgerblue}{\textbf{Normality Test}}\)

  • \(H_0\) = the sample is normally distributed (p >= 0.05)
  • \(H_1\) = the sample is not normally distributed (p < 0.05)
def is_normal(x, sig=0.05):
    """Test whether a sample is plausibly normal via a one-sample KS test.

    The sample is compared against a normal distribution parameterized by the
    sample's own mean and standard deviation. Prints the verdict and also
    returns it so callers can branch on the result programmatically.

    x - array-like sample of observations
    sig - significance level for rejecting normality (default 0.05)

    Returns True when we fail to reject normality (p >= sig), else False.
    """
    # NOTE(review): estimating mu/sigma from the same sample makes the plain
    # KS p-value conservative (a Lilliefors correction would be stricter).
    statistic, p_value = stats.kstest(x, 'norm', args=(x.mean(), x.std()))

    if p_value < sig:
        print("Reject the null (p = " + str(round(p_value, 3)) + "), can't consider normal")
        return False
    print("Fail to reject the null (p = " + str(round(p_value, 3)) + "), safe to assume normal")
    return True
is_normal(asia_07)
Fail to reject the null (p = 0.324), safe to assume normal
is_normal(africa_07)
Fail to reject the null (p = 0.691), safe to assume normal

\(\color{dodgerblue}{\textbf{T-Test}}\)

  • \(H_0\) = \(\mu_x=\mu_y\)
  • \(H_1\) = \(\mu_x\neq\mu_y\)
# Overlaid density histograms + KDE curves for the two continent samples
sns.histplot(asia_07, 
             bins = 20, 
             color = 'blue',
             stat='density',
             label = "Asia")

sns.kdeplot(asia_07, 
            color = 'darkblue',
            linewidth = 4)

sns.histplot(africa_07, 
             bins = 20, 
             color = 'red',
             stat='density',
             label = "Africa")

sns.kdeplot(africa_07, 
            color = 'darkred',
            linewidth = 4)
plt.legend()
plt.show()

# Independent two-sample t-test for equal means (equal variances assumed by default)
stats.ttest_ind(asia_07, africa_07)
Ttest_indResult(statistic=7.9273954920841065, pvalue=9.074251950172207e-12)

\(\color{dodgerblue}{\textbf{Kolmogorov–Smirnov Test}}\)

  • \(H_0\) = the two samples are drawn from the same distribution (p >= 0.05)
  • \(H_1\) = the two samples are drawn from different distributions (p < 0.05)
# ECDF lets us draw each sample's empirical CDF next to its fitted normal CDF
from statsmodels.distributions.empirical_distribution import ECDF 

# Fitted normal CDF for Asia (plotting works because asia_07 is sorted ascending)
plt.plot(asia_07, 
         stats.norm.cdf(asia_07, asia_07.mean(), asia_07.std()), 
         color = 'blue', 
         label = "Asia")

asia_ecdf = ECDF(asia_07)
plt.plot(asia_ecdf.x, asia_ecdf.y, color = "lightblue")

# Same pair of curves for Africa
plt.plot(africa_07, 
         stats.norm.cdf(africa_07, africa_07.mean(), africa_07.std()), 
         color = 'red', 
         label = "Africa")

africa_ecdf = ECDF(africa_07)
plt.plot(africa_ecdf.x, africa_ecdf.y, color = "salmon")

plt.legend()
plt.show()

# Two-sample KS test: are the two samples drawn from the same distribution?
stats.ks_2samp(asia_07, africa_07)
KstestResult(statistic=0.7389277389277389, pvalue=2.5241475576365247e-11)

Return to Data Page